Setup - Import Libraries¶

In [1]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"


from tqdm.auto import tqdm

Load Data¶

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/wsovine/data/main/nhl_moneyline_model_logistic_regression_2022_season.csv')

Create Time Series Split¶

  1. Convert the spark dataframe to a pandas dataframe
  2. Obtain predictor and dependent variables
  3. Split into different time series buckets (~25 weeks in NHL season)
In [3]:
df.sort_values('game_date', inplace=True)
df.reset_index(inplace=True, drop=True)

X = df.copy()
X = X.drop(['home', 'away', 'game_date', 'season',
             'home_goals', 'away_goals', 'home_win'], axis=1)

y = df.home_win.values

print(X.shape)

tss = TimeSeriesSplit(n_splits=25)
(1316, 92)

Define Helper Functions¶

Functions to scale predictors and add one-hot encode categorical variables

In [4]:
def scale(X_train, X_test) -> (pd.DataFrame, pd.DataFrame):
    scaler = StandardScaler()
    X_train.loc[:,:] = scaler.fit_transform(X_train)
    X_test.loc[:,:] = scaler.transform(X_test)

    return (X_train, X_test)

def dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    home_dummies = pd.get_dummies(original_df.home, prefix='home')
    X_train = X_train.join(home_dummies, how='inner')
    X_test = X_test.join(home_dummies, how='inner')

    away_dummies = pd.get_dummies(original_df.away, prefix='away')
    X_train = X_train.join(away_dummies, how='inner')
    X_test = X_test.join(away_dummies, how='inner')

    return (X_train, X_test)

def scale_and_dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    X_train, X_test = scale(X_train, X_test)
    X_train, X_test = dummy(X_train, X_test, original_df)

    return (X_train, X_test)

Functions for calculations that involve the odds and calculating profit

In [5]:
# This function converts american odds to win probability
def odds_to_profit(odds, u = 1):
    if odds < 0:
        profit = -100 / odds
    else:
        profit = odds / 100
    return profit * u

def odds_to_loss(odds, u = 1):
    if odds < 0:
        loss = odds / -100
    else:
        loss = 100 / odds
    return -loss * u

def odds_to_prob(odds):
    if odds < 0:
        prob = odds / (odds - 100)
    else:
        prob = 100 / (odds + 100)
    return prob


def odds_to_dec(odds):
    if odds < 0:
        dec = (100 / -odds)
    else:
        dec = (odds / 100)
    return dec + 1

def profitize(df, aggregated=True, kelly_scale=100):
    df['away_win'] = np.where(df.home_win, 0, 1)

    df['home_decimal_odds'] = df.home_consensus.apply(lambda o: odds_to_dec(o))
    df['away_decimal_odds'] = df.away_consensus.apply(lambda o: odds_to_dec(o))

    # Determine the profit or loss for a bet on either team in each matchup
    df['bet_on_home'] = np.where(
        df.home_prob > df.home_implied_prob, 1, 0
        )
    df['bet_on_away'] = np.where(
        df.away_prob > df.away_implied_prob, 1, 0
        )

    # Flat 1u bet
    df['home_profit_if_win'] = df.home_consensus.apply(lambda o: odds_to_profit(o))
    df['away_profit_if_win'] = df.away_consensus.apply(lambda o: odds_to_profit(o))

    df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, -1)
    df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, -1)

    df['flat_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    # Bet to win 1u
    df['home_loss_if_lose'] = df.home_consensus.apply(lambda o: odds_to_loss(o))
    df['away_loss_if_lose'] = df.away_consensus.apply(lambda o: odds_to_loss(o))

    df['home_pnl_if_bet'] = np.where(df.home_win, 1, df.home_loss_if_lose)
    df['away_pnl_if_bet'] = np.where(df.away_win, 1, df.away_loss_if_lose)

    df['win_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    # Kelly Criterion
    df['home_kelly_bet'] = np.where(
        df.home_prob > df.home_implied_prob, 
        (df.home_prob - df.away_prob / df.home_decimal_odds) * kelly_scale,
        0)
    df['away_kelly_bet'] = np.where(
        df.away_prob > df.away_implied_prob, 
        (df.away_prob - df.home_prob / df.away_decimal_odds) * kelly_scale,
        0)
    
    df['home_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.home_consensus, r.home_kelly_bet), axis=1)
    df['away_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.away_consensus, r.away_kelly_bet), axis=1)

    df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, df.home_kelly_bet * -1)
    df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, df.away_kelly_bet * -1)

    df['kelly_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    if aggregated:
        return df.flat_1u_pnl.sum(), df.win_1u_pnl.sum(), df.kelly_pnl.sum()
    return df.flat_1u_pnl, df.win_1u_pnl, df.kelly_pnl

Obtain Accuracy Baseline¶

In [6]:
results = []

for i, (train_index, test_index) in enumerate(tss.split(X)):
    X_test = X.iloc[test_index]
    y_test = y[test_index]

    y_pred = np.where(X_test.home_implied_prob >= .5, 1, 0)
    y_prob = X_test.home_implied_prob

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    results.append({
        # 'Fold': i,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc
    })

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")

display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
Average Accuracy: 0.5856
Average ROC AUC: 0.6478545892291949
  Accuracy ROC AUC
0 0.580000 0.579710
1 0.460000 0.620994
2 0.540000 0.553600
3 0.620000 0.678400
4 0.560000 0.613527
5 0.640000 0.765833
6 0.560000 0.652709
7 0.560000 0.643317
8 0.660000 0.734300
9 0.460000 0.651067
10 0.540000 0.600000
11 0.640000 0.728896
12 0.680000 0.649351
13 0.580000 0.491790
14 0.600000 0.696970
15 0.600000 0.603365
16 0.580000 0.597403
17 0.560000 0.706731
18 0.540000 0.590982
19 0.620000 0.698686
20 0.600000 0.660800
21 0.700000 0.790640
22 0.680000 0.770032
23 0.600000 0.788462
24 0.480000 0.328800

Logistic Regression¶

Model Using Time Series Split¶

In [7]:
results = []
coefs = []

for train_index, test_index in tqdm(tss.split(X), total=tss.n_splits):
    X_train = X.iloc[train_index].copy()
    X_test = X.iloc[test_index].copy()
    X_train, X_test = scale_and_dummy(X_train, X_test, df)

    y_train = y[train_index]
    y_test = y[test_index]

    logr = LogisticRegressionCV(solver='liblinear')
    logr.fit(X_train, y_train)

    y_pred = logr.predict(X_test)
    y_prob = logr.predict_proba(X_test)[:,1]

    df_test = df.iloc[test_index].copy()
    df_test['home_prob'] = y_prob
    df_test['away_prob'] = 1 - df_test.home_prob

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, kelly_scale=1)

    results.append({
        'C': logr.C_,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc,
        'Flat 1u PNL': flat_1u_pnl,
        'Win 1u PNL': win_1u_pnl,
        'Kelly PNL': kelly_pnl
    })

    coefs.append(
        pd.DataFrame(
            zip(logr.feature_names_in_, logr.coef_[0]), 
            columns=['feature', 'coef']
            )
        )

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
print(f"Flat 1u PNL: {df_results['Flat 1u PNL'].sum()}")
print(f"Win 1u PNL: {df_results['Win 1u PNL'].sum()}")
print(f"Kelly PNL: {df_results['Kelly PNL'].sum()}")

display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
  0%|          | 0/25 [00:00<?, ?it/s]
Average Accuracy: 0.6008
Average ROC AUC: 0.6459448800634778
Flat 1u PNL: 63.31012164469311
Win 1u PNL: 70.8473185263898
Kelly PNL: 20.06039080559668
  C Accuracy ROC AUC Flat 1u PNL Win 1u PNL Kelly PNL
0 [0.00077426] 0.560000 0.571659 3.179971 1.727962 0.825447
1 [0.0001] 0.500000 0.607372 8.410000 8.416261 1.998805
2 [0.0001] 0.460000 0.528000 6.550000 4.687297 1.932735
3 [0.0001] 0.580000 0.678400 -4.380000 -1.864863 -1.348194
4 [0.00599484] 0.580000 0.587762 9.110084 6.294694 1.816271
5 [0.00077426] 0.680000 0.745000 0.339476 3.111434 0.682678
6 [0.0001] 0.660000 0.666667 -1.897619 -1.908672 -0.698671
7 [0.00077426] 0.560000 0.631240 7.754195 5.508981 1.838118
8 [0.00077426] 0.660000 0.764895 -7.397860 -4.732032 -1.713640
9 [0.0001] 0.580000 0.655172 4.939091 3.742878 1.392340
10 [0.0001] 0.540000 0.595200 0.262381 0.155511 0.068563
11 [0.00077426] 0.680000 0.728896 2.237104 5.560010 1.294972
12 [0.04641589] 0.600000 0.608766 -3.787923 -0.934046 -1.156780
13 [0.00599484] 0.540000 0.512315 5.407887 1.059649 0.415893
14 [0.00599484] 0.620000 0.625668 9.004708 9.554287 2.976067
15 [0.00077426] 0.580000 0.604167 3.047963 -0.515959 0.298963
16 [0.00077426] 0.600000 0.599026 11.010140 7.040508 2.460569
17 [0.00077426] 0.620000 0.714744 -2.129691 2.967900 0.303268
18 [0.00599484] 0.580000 0.650564 12.687147 8.900983 4.069783
19 [0.00077426] 0.620000 0.722496 6.014087 7.815041 2.294935
20 [0.00077426] 0.600000 0.667200 -6.274944 -6.621247 -1.603295
21 [0.00077426] 0.760000 0.784893 -10.308423 -4.192636 -1.481131
22 [0.00599484] 0.720000 0.798077 0.022357 6.860426 1.188093
23 [0.00077426] 0.720000 0.777244 -1.691465 4.307596 0.951462
24 [0.00077426] 0.420000 0.323200 11.201457 3.905358 1.253140

Coefficients¶

In [8]:
pd.concat(coefs, axis=0).groupby('feature').agg(
    {'coef': ['mean', 'median', 'std']}
    ).sort_values(('coef', 'median'), key=abs, ascending=False).head()
Out[8]:
coef
mean median std
feature
home_goal_diff_offense_str -0.040254 -0.026278 0.047158
away_goal_diff_defense_str -0.030086 -0.018598 0.042356
home_implied_prob 0.019238 0.012753 0.026518
away_implied_prob -0.019514 -0.012598 0.027504
home_TwinSpires -0.017955 -0.012194 0.028835

Model Using Each Day¶

In [9]:
dfs = []
game_dates = df.groupby(['season', 'game_date']).count().reset_index()[['season', 'game_date']]
kelly_bankroll = 100

all_game_dates = tqdm(game_dates.itertuples(), total=game_dates.shape[0])

for scope in all_game_dates:
    y_train = y[(df.game_date < scope.game_date) & (df.season == scope.season)]

    if (len(y_train) >= 16 
        and y_train[y_train == 0].shape[0] > 5 
        and y_train[y_train == 1].shape[0] > 5):

        X_train = X[(df.game_date < scope.game_date) & (df.season == scope.season)].copy()
        df_test = df[df.game_date == scope.game_date]
        X_test = X.iloc[df_test.index].copy()

        df_test = df_test[['season', 'game_date', 'home', 'away', 'home_goals', 'away_goals', 'home_win', 'home_consensus', 'away_consensus', 'home_implied_prob', 'away_implied_prob']]
        df_test['away_win'] = np.where(df_test.home_win, 0, 1)

        X_train, X_test = scale_and_dummy(X_train, X_test, df)

        logr = LogisticRegression(C=.00077426, solver='liblinear')
        logr.fit(X_train, y_train)

        y_pred = logr.predict(X_test)
        y_prob = logr.predict_proba(X_test)[:,1]

        df_test['home_prob'] = y_prob
        df_test['away_prob'] = 1 - df_test.home_prob

        flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, aggregated=False, kelly_scale=100)
        df_test['flat_1u_pnl'] = flat_1u_pnl
        df_test['win_1u_pnl'] = win_1u_pnl
        df_test['kelly_pnl'] = kelly_pnl
        kelly_bankroll += kelly_pnl.sum()

        all_game_dates.set_description(f'{scope.game_date} | ${kelly_bankroll:.0f}')
                
        dfs.append(df_test)

df_test = pd.concat(dfs)
# display(df_test)
  0%|          | 0/220 [00:00<?, ?it/s]
In [10]:
df_pnl = df_test.copy()

df_pnl['game_date'] = pd.to_datetime(df_pnl.game_date, format='%Y%m%d')

df_pnl['cum_flat_1u_pnl'] = df_pnl.groupby('season').flat_1u_pnl.transform('cumsum')
df_pnl['cum_win_1u_pnl'] = df_pnl.groupby('season').win_1u_pnl.transform('cumsum')
df_pnl['cum_kelly_pnl'] = df_pnl.groupby('season').kelly_pnl.transform('cumsum')
In [11]:
fig = px.area(df_pnl, x='game_date', y='cum_kelly_pnl', color='season')
fig.update_yaxes(title='Profit / Loss')
fig.update_xaxes(title='Date')
In [ ]: